library(data.table)
library(ggthemes)
library(tidyverse)
library(GGally)
library(plotly)
library(plotly)
library(mde)
library(corrplot)
library(viridis)
library(caTools)
theme_set(theme_few())
salary <- as_tibble(fread('NBA_season1718_salary.csv'))
ss <- as_tibble(fread('Seasons_Stats.csv'))
head(salary)
## # A tibble: 6 × 4
## V1 Player Tm season17_18
## <int> <chr> <chr> <dbl>
## 1 1 Stephen Curry GSW 34682550
## 2 2 LeBron James CLE 33285709
## 3 3 Paul Millsap DEN 31269231
## 4 4 Gordon Hayward BOS 29727900
## 5 5 Blake Griffin DET 29512900
## 6 6 Kyle Lowry TOR 28703704
summary(salary)
## V1 Player Tm season17_18
## Min. : 1 Length:573 Length:573 Min. : 17224
## 1st Qu.:144 Class :character Class :character 1st Qu.: 1312611
## Median :287 Mode :character Mode :character Median : 2386864
## Mean :287 Mean : 5858946
## 3rd Qu.:430 3rd Qu.: 7936509
## Max. :573 Max. :34682550
After viewing the head and summary, we have noticed several things a).colname “season17_18” should be changed to “salary”; b).there is no null values in the salary table; c).this table is already ranked based off salary.
head(ss)
## # A tibble: 6 × 53
## V1 Year Player Pos Age Tm G GS MP PER `TS%` `3PAr`
## <int> <int> <chr> <chr> <int> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 0 1950 Curly Arms… G-F 31 FTW 63 NA NA NA 0.368 NA
## 2 1 1950 Cliff Bark… SG 29 INO 49 NA NA NA 0.435 NA
## 3 2 1950 Leo Barnho… SF 25 CHS 67 NA NA NA 0.394 NA
## 4 3 1950 Ed Bartels F 24 TOT 15 NA NA NA 0.312 NA
## 5 4 1950 Ed Bartels F 24 DNN 13 NA NA NA 0.308 NA
## 6 5 1950 Ed Bartels F 24 NYK 2 NA NA NA 0.376 NA
## # … with 41 more variables: FTr <dbl>, `ORB%` <dbl>, `DRB%` <dbl>,
## # `TRB%` <dbl>, `AST%` <dbl>, `STL%` <dbl>, `BLK%` <dbl>, `TOV%` <dbl>,
## # `USG%` <dbl>, blanl <lgl>, OWS <dbl>, DWS <dbl>, WS <dbl>, `WS/48` <dbl>,
## # blank2 <lgl>, OBPM <dbl>, DBPM <dbl>, BPM <dbl>, VORP <dbl>, FG <int>,
## # FGA <int>, `FG%` <dbl>, `3P` <int>, `3PA` <int>, `3P%` <dbl>, `2P` <int>,
## # `2PA` <int>, `2P%` <dbl>, `eFG%` <dbl>, FT <int>, FTA <int>, `FT%` <dbl>,
## # ORB <int>, DRB <int>, TRB <int>, AST <int>, STL <int>, BLK <int>, …
# summary(ss)
After viewing the head and summary, we have noticed several things a).this table contains lots of columns as well as NA values. b). stat data ranges from 1950 to …. c). there were players who played for different teams within a season d). completely empty columns: blani/blank2 which should be dropped
ss17 <- ss %>% filter(Year==2017)
df <- merge(ss17, salary, by.x='Player',by.y = 'Player')
df <- df %>% select(
Player,
Age,
Team = Tm.y,
ORB,DRB,AST,STL,BLK,TOV,PTS,
G,GS,MP,Eff= PER,"TS%",FG,FGA,'FG%',"3P","3P%","2P","2PA","2P%",
Salary=season17_18)
#null summary
na_summary(df,sort_by = "percent_complete")
## variable missing complete percent_complete percent_missing
## 5 3P% 33 519 94.02174 5.9782609
## 2 2P% 3 549 99.45652 0.5434783
## 12 FG% 1 551 99.81884 0.1811594
## 24 TS% 1 551 99.81884 0.1811594
## 1 2P 0 552 100.00000 0.0000000
## 3 2PA 0 552 100.00000 0.0000000
## 4 3P 0 552 100.00000 0.0000000
## 6 Age 0 552 100.00000 0.0000000
## 7 AST 0 552 100.00000 0.0000000
## 8 BLK 0 552 100.00000 0.0000000
## 9 DRB 0 552 100.00000 0.0000000
## 10 Eff 0 552 100.00000 0.0000000
## 11 FG 0 552 100.00000 0.0000000
## 13 FGA 0 552 100.00000 0.0000000
## 14 G 0 552 100.00000 0.0000000
## 15 GS 0 552 100.00000 0.0000000
## 16 MP 0 552 100.00000 0.0000000
## 17 ORB 0 552 100.00000 0.0000000
## 18 Player 0 552 100.00000 0.0000000
## 19 PTS 0 552 100.00000 0.0000000
## 20 Salary 0 552 100.00000 0.0000000
## 21 STL 0 552 100.00000 0.0000000
## 22 Team 0 552 100.00000 0.0000000
## 23 TOV 0 552 100.00000 0.0000000
# we will just drop missing values because they weren't important
df <- na.omit(df)
nu_df <- df %>% select(where(is.numeric))
corrplot(cor(nu_df),method='pie')
cor_salary <- as.data.frame(cor(nu_df)[,'Salary'])
names(cor_salary) <- 'Salary'
cor_salary %>% arrange(desc(Salary))
## Salary
## Salary 1.00000000
## PTS 0.71886201
## FG 0.71647169
## FGA 0.69445520
## 2P 0.68189697
## 2PA 0.67212484
## GS 0.67033193
## MP 0.65208864
## DRB 0.63334474
## TOV 0.63074911
## STL 0.57254137
## AST 0.53944238
## Eff 0.52858248
## 3P 0.50025037
## ORB 0.45689948
## BLK 0.45180786
## G 0.42357945
## TS% 0.25298405
## Age 0.19570007
## FG% 0.18930759
## 2P% 0.07659004
## 3P% 0.05295982
since PTS is most correlated with salary, let’s look closely how the graph will look like
s_p <- ggplot(df,aes(Salary/1000000,PTS,color=Player)) + geom_point(alpha=0.5) + labs(x='Salary(million)', y='Total Points Scored', title='Salary vs Point') +theme(plot.title = element_text(hjust=.5,size=14,face='bold'),legend.position = "none") + scale_y_continuous(breaks=seq(0,3000,500))+scale_x_continuous(breaks=seq(0,40,5))
ggplotly(s_p)
You can see that in general, pts and salary are positively correlated: if you scored more, you will get paid more.
The National Basketball Association today announced that the Salary Cap has been set at $99.093 million for the 2017-18 season.
df %>% group_by(Team) %>% summarize(ts = sum(Salary)) %>%
ggplot(aes(reorder(Team,ts),ts/1000000)) + geom_col(color='black',fill=' tomato') + coord_flip()+labs(x='Total Salary(million)',y=NULL,title='Total Salary per Team') + theme(plot.title = element_text(size=14, hjust=.5)) + geom_hline(yintercept = 99.093,linetype='dotted',color='blue',size=1.5)
We can see that nearly 2/3 of NBA teams’ salary are above the Salary Cap
99.093 million. It might be interesting to acquire team winning data and
further compare them with total salary to see if they are
correlated.
sample <- sample.split(df$Salary,.7)
train <- subset(df, sample==T)
test <- subset(df, sample==F)
model <- lm(Salary~.,train[,c('Salary',"ORB","DRB","AST","STL","BLK","TOV","PTS")])
summary(model)
##
## Call:
## lm(formula = Salary ~ ., data = train[, c("Salary", "ORB", "DRB",
## "AST", "STL", "BLK", "TOV", "PTS")])
##
## Residuals:
## Min 1Q Median 3Q Max
## -13962098 -2568200 -182771 1875066 20374075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3130 431957 -0.007 0.994222
## ORB 11191 9890 1.132 0.258583
## DRB 9161 5055 1.812 0.070789 .
## AST 19621 5010 3.917 0.000108 ***
## STL -15780 14745 -1.070 0.285264
## BLK 12400 14271 0.869 0.385519
## TOV -51110 15491 -3.299 0.001068 **
## PTS 11240 1385 8.113 8.25e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4850000 on 353 degrees of freedom
## Multiple R-squared: 0.5635, Adjusted R-squared: 0.5548
## F-statistic: 65.09 on 7 and 353 DF, p-value: < 2.2e-16
names(df)
## [1] "Player" "Age" "Team" "ORB" "DRB" "AST" "STL" "BLK"
## [9] "TOV" "PTS" "G" "GS" "MP" "Eff" "TS%" "FG"
## [17] "FGA" "FG%" "3P" "3P%" "2P" "2PA" "2P%" "Salary"
We only limited our predictors to ORB : PTS as these data are most readily available and we can acquire them online to predict a player’s next season salary. * From the summary table, we can see that PTS,TOV,BLK,AST are four most important factors when it comes to NBA salary prediction.
res <- as.data.frame(residuals(model))
names(res) <- "residual"
ggplot(res,aes(residual)) + geom_histogram(fill='tomato',color='black')
The residual histogram is normally distributed
Salary.predictions <- predict(model,test)
results <- cbind(Salary.predictions,test$Salary)
colnames(results) <- c('pred','actual')
results <- as.data.frame(results)
results
## pred actual
## 1 595240.34 1312611
## 2 3081317.68 2116955
## 3 13276742.12 5504420
## 10 8157402.37 4187599
## 11 926476.30 778668
## 14 8182791.44 11000000
## 17 10797863.75 14814815
## 21 1864238.79 2328652
## 22 5003023.52 1312611
## 27 26476917.68 23775506
## 28 5847539.93 3290000
## 29 5847539.93 2000000
## 39 3556885.13 25000
## 49 2756381.83 5179760
## 52 8502511.91 119010
## 54 5454389.13 13618750
## 55 2540512.09 119602
## 59 693029.18 50000
## 70 3085413.09 3675480
## 71 8990210.11 3675480
## 74 3184058.41 333334
## 75 1347657.64 2203440
## 76 1679161.40 2203440
## 88 472998.97 1471382
## 89 510325.95 1471382
## 97 9370375.28 7630000
## 101 8929209.56 5562360
## 103 23931424.68 26153057
## 105 13996221.28 20559599
## 107 6544705.06 2300000
## 108 3824153.95 4992385
## 116 951661.72 26773
## 118 951661.72 1577230
## 119 125449.58 26773
## 121 125449.58 1577230
## 124 1080241.53 1577230
## 126 802827.88 1312611
## 127 1706091.48 1645200
## 128 21800833.78 27739975
## 134 208838.49 50000
## 135 208838.49 92858
## 136 13882812.40 15500000
## 139 8871708.83 5500000
## 140 6822717.66 5500000
## 141 6146172.21 12000000
## 143 12911638.83 2116955
## 144 3627577.09 104059
## 146 1783658.24 104059
## 148 5258046.47 4402546
## 154 10921201.80 5000000
## 161 14343746.54 16400000
## 164 11519975.14 15550000
## 165 11519975.14 2328652
## 170 11413788.37 20566802
## 173 3358311.48 6000000
## 175 9421863.37 6000000
## 178 12882788.52 502328
## 179 96353.22 6000000
## 182 7292043.94 17131148
## 187 77840.51 50000
## 195 5698821.27 1524305
## 200 11432182.25 17884176
## 201 17114108.59 23112004
## 209 1861426.09 53465
## 210 115586.40 1312611
## 211 23812483.95 6261395
## 214 3972526.34 1312611
## 221 654448.76 1312611
## 224 9116307.93 10942762
## 226 9758380.02 2262871
## 228 5081685.78 3028410
## 229 25335477.24 28299399
## 230 11068106.90 13954000
## 231 9603586.09 8533333
## 234 265900.42 17224
## 237 -22473.86 2328652
## 238 5323438.54 5225000
## 241 5343510.46 4956480
## 249 5875964.09 12000000
## 251 278902.68 9000000
## 261 354054.72 1312611
## 267 21316220.13 18063850
## 276 3009440.83 250000
## 290 3479089.97 1579440
## 291 6466363.98 1000000
## 296 7262002.62 2947305
## 298 1047914.99 100000
## 300 26847989.76 6216840
## 306 9119844.00 12921348
## 312 3994467.49 14100000
## 314 7042836.32 8393000
## 320 3477072.21 7000000
## 323 1007896.91 4666500
## 324 20623542.80 18868625
## 325 17218249.77 21461010
## 330 3593359.62 6655325
## 332 5715143.48 6666667
## 336 4435728.86 500000
## 338 9907133.44 7000000
## 341 4001701.10 2947305
## 342 4199568.25 1974159
## 353 179575.69 1312611
## 355 9808702.72 4538020
## 362 13037849.47 13168750
## 370 8991807.52 10162922
## 374 112615.95 500000
## 379 2338868.53 1662500
## 380 6327230.16 5000000
## 381 760698.36 1709538
## 390 5029057.62 3949999
## 393 3048472.01 4187599
## 395 685152.80 2328652
## 397 15595026.50 22434783
## 400 18765123.35 1471382
## 408 49883.71 2106470
## 411 15891865.32 24773250
## 412 2155163.67 7590035
## 421 12998521.46 16000000
## 422 16844546.93 19508958
## 426 118121.49 1709538
## 428 2831258.36 1709538
## 431 733592.79 100000
## 433 138629.40 100000
## 438 4001851.30 263124
## 440 1609903.96 1889040
## 442 1034921.16 1471382
## 444 2005501.44 2500000
## 445 8508105.35 16000000
## 446 5526283.17 2328652
## 452 8385729.61 2386864
## 457 5580350.44 8406000
## 459 29323623.24 28530608
## 464 8414394.20 172238
## 465 8414394.20 789725
## 467 3760161.22 20061729
## 468 11604431.65 20061729
## 472 5067424.38 7692308
## 473 3839538.78 6000000
## 477 4831926.64 6021175
## 484 8985640.85 1471382
## 485 11888223.91 3152931
## 487 10500517.90 14000000
## 488 8363995.72 14000000
## 490 2869074.78 2422560
## 492 8845499.87 10500000
## 500 4429550.54 15280000
## 502 16251509.77 16000000
## 505 8768795.39 15453126
## 518 663061.87 1471382
## 525 3172709.74 1709538
## 532 11135906.52 21000000
## 536 7872892.96 6270000
## 537 442621.26 1312611
## 546 13457617.24 12016854
## 548 5493246.82 1312611
## 550 9342471.70 3202217
SSE = sum((results$pred - results$actual)^2)
SST = sum( (mean(df$Salary) - results$actual)^2)
R2 = 1 - SSE/SST
R2
## [1] 0.504912
hm.. low R2 is not a good sign but let’s keep exploring
*Let’s predict Stephen Curry(the best player in the world:)) salary of next season
curry_stas <- data.frame(name = 'Stephen Curry', PTS = 1999,AST = 524,BLK=17,TOV=239 )
n_model <- lm(Salary~., df[,c('Salary','PTS','AST','BLK','TOV')])
outcome <- predict(n_model,curry_stas)
print(paste('pred:', outcome, 'real: 45780000'))
## [1] "pred: 21643871.0410019 real: 45780000"